In [1]:
import pkg_resources
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

In [2]:
from deepforest.gcForest import GCForest

In [3]:
def split_x_y(dataframe, target):
    """
    Separate a dataframe into its feature columns and its target column.

    Parameters
    ----------
    dataframe : frame holding both the features and the target.
    target : label of the column to predict.

    Returns
    -------
    tuple ``(X, y)`` where ``X`` is ``dataframe`` without ``target`` and
    ``y`` is ``dataframe[target]``. The input frame is not modified.
    """
    features = dataframe.drop(target, axis=1)
    labels = dataframe[target]
    return features, labels

Importing Data


In [4]:
# Load the training CSV that ships inside the `deepforest` package.
# The resource stream is opened in a `with` block so the handle is closed
# as soon as pandas has finished parsing it (read_csv does not close
# handles that the caller opened).
with pkg_resources.resource_stream('deepforest', 'data/train.csv') as train_csv:
    raw_data = pd.read_csv(train_csv)

In [5]:
# Preview the first rows of the raw frame to inspect its columns.
raw_data.head()


Out[5]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S

In [6]:
# Drop the columns that are not used as features.
clean_data = raw_data.drop(["Cabin", "Name", "PassengerId", "Ticket"], axis=1)
# One-hot encode the remaining non-numeric columns, then replace the
# missing values that are left with the sentinel -1.
clean_data = pd.get_dummies(clean_data).fillna(-1)

# A fixed seed keeps the train/test split (and every score computed from it)
# identical across "Restart & Run All".
train, test = train_test_split(clean_data, random_state=42)

# "Survived" is the column to predict; everything else is a feature.
X_train, y_train = split_x_y(train, "Survived")
X_test, y_test = split_x_y(test, "Survived")

Defining Model Generator


In [7]:
def paper_like_models(n_estimators=1000, min_samples_leaf=10, n_jobs=-1):
    """
    Build the list of forests that make up one layer.

    As in the paper, each layer is composed of two "classic" random forests
    and two complete-random forests. The complete-random forests are
    represented here by ``RandomForestClassifier`` instances restricted to a
    single candidate feature per split (``max_features=1``).

    Parameters
    ----------
    n_estimators : int, default 1000
        Number of trees in every forest.
    min_samples_leaf : int, default 10
        Minimum number of samples per leaf in every forest.
    n_jobs : int, default -1
        Forwarded to every forest as ``n_jobs``.

    Returns
    -------
    list
        Four new, unfitted classifiers: the two classic forests first,
        followed by the two ``max_features=1`` forests.
    """
    # Settings shared by all four forests; only `max_features` differs.
    shared = dict(n_estimators=n_estimators,
                  n_jobs=n_jobs,
                  min_samples_leaf=min_samples_leaf)
    classic = [RandomForestClassifier(**shared) for _ in range(2)]
    single_feature = [RandomForestClassifier(max_features=1, **shared)
                      for _ in range(2)]
    return classic + single_feature

In [8]:
def models_generator():
    """
    Endlessly yield model lists, one list per request.

    Every ``next()`` on the returned generator calls ``paper_like_models()``
    again and yields its result; the generator never terminates on its own.
    """
    while True:
        layer_models = paper_like_models()
        yield layer_models

Defining gcForest Model


In [9]:
# Build the model from the endless stream of model lists and hand it
# roc_auc_score as its metric (presumably used to evaluate each level --
# confirm in GCForest).
gcForest = GCForest(models_generator(), metric=roc_auc_score)

In [10]:
# NOTE(review): X_test/y_test are handed to grow() and reused below for the
# final AUC. If grow() uses them to decide when to stop adding levels, that
# score is optimistic -- confirm, and consider a separate validation split.
%time gcForest.grow(X_train, y_train, X_test, y_test)


CPU times: user 18.1 s, sys: 3.24 s, total: 21.3 s
Wall time: 18.7 s

In [11]:
# Predict on the held-out features; the result is indexed as a 2-D array
# below (presumably one column per class -- confirm in GCForest).
predictions = gcForest.predict_proba(X_test)

In [12]:
# Score column 1 of the predictions against the held-out labels
# (presumably column 1 is P(Survived == 1) -- confirm the column order).
roc_auc_score(y_test, predictions[:, 1])


Out[12]:
0.88299200799200783

In [13]:
# Inspect the `levels` attribute (presumably the number of levels that
# grow() built -- confirm in GCForest).
gcForest.levels


Out[13]:
1

In [ ]: